
# Load the amphibian trade table; text columns are kept as character vectors
# rather than being converted to factors.
amphiData <- read.csv(
  "./Data/Amphibians_in_trade.csv",
  stringsAsFactors = FALSE
)

library(xml2)
library(rvest)
library(dplyr)

# Function to query the AMNH database -------------------------------------

# Frost, Darrel R. 2020. Amphibian Species of the World: an Online Reference.
# Version 6.1 (Date of access). Electronic Database accessible at
# https://amphibiansoftheworld.amnh.org/index.php. American Museum of Natural
# History, New York, USA. doi.org/10.5531/db.vz.0001

# Query the AMNH "Amphibian Species of the World" basic search for one species
# and return the title text of the best search result.
#
# Arguments:
#   sp - a single species name (e.g. "Leptopelis aubryi"). Must be length one
#        and not NA.
#
# Returns:
#   A length-one character vector: the text of the first ".title" node that
#   contains `sp` literally, or, when no title contains it, the text of the
#   first ".title" node. Newlines are removed and surrounding whitespace is
#   trimmed. NA is returned when the results page has no ".title" nodes.
#
# Side effects:
#   Prints the species name followed by " | " as a progress marker and
#   performs one HTTP request.
#
# Examples:
# sp <- "Arthroleptis discodactylus"
# sp <- "Leptopelis aubryi"
get_AMNH_authority <- function(sp = NA){
  
  # guard against NULL / vector input before the scalar NA check, which would
  # otherwise fail with an unhelpful "condition has length" error
  if(length(sp) != 1){
    stop("A single species name must be provided")
  }
  if(is.na(sp)){
    stop("Species name must be provided")
  }
  sp <- as.character(sp)
  cat(paste0(sp, " | "))
  
  # replace every space, not only the first, so names with more than two
  # words still form a valid query string
  spSearch <- gsub(" ", "+", sp, fixed = TRUE)
  
  url <- paste0("https://amphibiansoftheworld.amnh.org/amphib/basic_search?basic_query=",
                spSearch,
                "&stree=&stree_id=")
  
  htmlData <- read_html(url,
                        encoding = "UTF-8")
  
  speciesResults <- htmlData %>% 
    html_nodes(".title") %>% 
    html_text()
  
  # Literal (non-regex) match of the species name against each result title.
  # Base grepl() is used so the function does not depend on stringr having
  # been attached before it is called.
  isMatch <- grepl(sp, speciesResults, fixed = TRUE)
  
  if(any(isMatch)){
    # first result that has the species name within it
    speciesResults <- speciesResults[isMatch][1]
  } else {
    # no species match: fall back to the top taxon in the search
    # (NA when the search returned nothing at all)
    speciesResults <- speciesResults[1]
  }
  
  speciesAuth <- trimws(gsub("\n", "", speciesResults))
  return(speciesAuth)
}


# Apply the function to all species ---------------------------------------

### ACCESSED 2020-10-02

# 2 second delay to reduce server load

## running it all at once is vulnerable to power outages
# speciesAuthorities <- sapply(amphiData$amphiName, function(x){
#   spAuth <- get_AMNH_authority(x)
#   Sys.sleep(2)
#   return(spAuth)
# })

# The authority column is in-filled one row at a time so that an interrupted
# run can be resumed: rows that already hold a value are skipped. The column
# is created here when the table does not have it yet (first run).
if(!"speciesAuthority" %in% names(amphiData)){
  amphiData$speciesAuthority <- NA_character_
}

# seq_len() keeps the loop from running at all on an empty table
for(row in seq_len(nrow(amphiData))){
  if(is.na(amphiData[row,"speciesAuthority"])){
    amphiData[row,"speciesAuthority"] <- get_AMNH_authority(amphiData[row,"amphiName"])
    # Sys.sleep(2)
  } else {
    print(" skipped ")
  }
}

# check completion, should have zero NAs
sum(is.na(amphiData$speciesAuthority))

library(stringr)

# Find species whose search result carries a different binomial from the one
# queried. The AMNH name is taken as the first two words of the returned
# authority string once double quotes are stripped. 529 such mismatches were
# reported, many of them simple genus swaps.
#
# A changed genus with an unchanged species epithet is assumed to still give
# the correct authority, so those rows are discarded: only rows where the
# genus is the same or the epithet differs are kept as problem species.
probSpp <- amphiData %>% 
  mutate(amnhName = word(gsub("\"", "", speciesAuthority), 1, 2)) %>% 
  filter(amphiName != amnhName) %>% 
  select(amphiName, amnhName) %>% 
  filter(word(amphiName, 1, 1) == word(amnhName, 1, 1) |
           word(amphiName, 2, 2) != word(amnhName, 2, 2))
# 174 problem species
# some appear to be minor changes to the species epithet
library(similiars)
# Using the similiars package, compare the species epithet (second word) of
# the queried name with that of the AMNH name for every problem species, so
# that near-identical epithets can be filtered out afterwards.
# NOTE(review): column 3 of the first element returned by
# find_string_distance() is presumably the distance value - confirm against
# the similiars documentation.
listSpeciesDist <- apply(probSpp, 1, function(nameRow){
  queriedEpithet <- word(nameRow["amphiName"], 2, 2)
  amnhEpithet <- word(nameRow["amnhName"], 2, 2)
  epithetDist <- find_string_distance(queriedEpithet, amnhEpithet)
  epithetDist[[1]][,3]
})
probSpp$spEpiDist <- unlist(listSpeciesDist)

# Cut-off: epithets that differ by 3 characters or fewer are accepted as minor
# spelling changes; a distance of 4 or more needs checking by hand.
# which() drops rows with a missing distance.
problemAuthSpp <- probSpp$amphiName[which(probSpp$spEpiDist >= 4)]
# now down to 120 species with problematic authorities  

# Show the traded species that still have a problematic authority
# (12 species were reported here)
select(
  filter(amphiData, anyTraded & amphiName %in% problemAuthSpp),
  amphiName, speciesAuthority
)

# check and fix those odd names
# Manually checked authorities for the traded problem species. The values are
# positional: they must be listed in the same order as the matching rows of
# amphiData.
# NOTE(review): the comment above the preceding summary reports 12 traded
# problem species, but 13 authorities are listed here - confirm which is right.
fixedAuthorities <- 
  c("Werner, 1894",
    "Peters, 1873",
    "Peters, 1873",
    "Schmidt, 1857",
    "Schneider, 1799",
    "Günther, 1859",
    "Boulenger, 1905",
    "Günther, 1867",
    "Grandidier, 1872",
    "Angel, 1930",
    "Ahl, 1925",
    "Linnaeus, 1758",
    "Crochet, Dubois, Ohler & Tunner, 1995")

# which() ignores rows where anyTraded is NA, matching the behaviour of
# filter(); a logical subscript containing NA cannot be assigned to
fixRows <- which(amphiData$amphiName %in% problemAuthSpp & amphiData$anyTraded)

# A count mismatch means the search results (and so the set of problem
# species) differ from when the list was compiled; assigning anyway would
# silently attach authorities to the wrong species.
if(length(fixRows) != length(fixedAuthorities)){
  stop("Found ", length(fixRows), " traded problem species but ",
       length(fixedAuthorities), " manual authorities are listed; ",
       "re-check the manual authority list",
       call. = FALSE)
}

amphiData$speciesAuthority[fixRows] <- fixedAuthorities

# Derive the description year from each authority string: the greedy pattern
# keeps the last run of four digits, and strings without one become NA when
# converted to numeric.
amphiData$descYear <- as.numeric(
  sub('.*(\\d{4}).*', '\\1', amphiData$speciesAuthority)
)

# Write the augmented table back over the input file (no row-name column)
write.csv(amphiData,
          file = "./Data/Amphibians_in_trade.csv",
          row.names = FALSE)

